{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8c94e471-9094-4aba-ab16-84ba5f7e43e6",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-07-24T06:42:16.130398Z",
     "iopub.status.busy": "2023-07-24T06:42:16.130003Z",
     "iopub.status.idle": "2023-07-24T06:42:17.822068Z",
     "shell.execute_reply": "2023-07-24T06:42:17.821704Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "# Change the working directory to the replication Build folder.\n",
    "# Set the IS_BUILD_DIR environment variable to run the notebook on another machine;\n",
    "# the original hard-coded location is kept as the fallback so existing runs are unaffected.\n",
    "os.chdir(os.environ.get(\n",
    "    \"IS_BUILD_DIR\",\n",
    "    \"/Users/xiaosongw/Dropbox/Research/InformedSources/Replication/Build\",\n",
    "))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4cb8b01d-e293-468c-9583-9002507a3d4b",
   "metadata": {},
   "source": [
    "# Combine all Informed Sources (IS) prices for Victoria (VIC)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1d69a44b-bca9-49ce-955e-be654b3d4c40",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-07-24T06:42:17.824326Z",
     "iopub.status.busy": "2023-07-24T06:42:17.824213Z",
     "iopub.status.idle": "2023-07-24T06:42:17.828069Z",
     "shell.execute_reply": "2023-07-24T06:42:17.827758Z"
    }
   },
   "outputs": [],
   "source": [
    "df0 = pd.DataFrame()\n",
    "# Yearly VIC workbooks found in ./Input/.\n",
    "# os.listdir returns names in arbitrary order, so sort them to make the concatenation\n",
    "# order (and any order-dependent tie-breaking downstream) reproducible across machines.\n",
    "# Names starting with '~$' (Excel lock files) or '.' (hidden files) are not readable workbooks.\n",
    "l_files = sorted(i for i in os.listdir(\"./Input/\")\n",
    "                 if 'VIC_' in i and not i.startswith(('~$', '.')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f835366a-d63f-42e8-9945-2719823ddaef",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-07-24T06:42:17.829813Z",
     "iopub.status.busy": "2023-07-24T06:42:17.829713Z",
     "iopub.status.idle": "2023-07-24T06:48:40.864921Z",
     "shell.execute_reply": "2023-07-24T06:48:40.864467Z"
    }
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "# Column dtypes shared by the yearly VIC_* workbooks and the older workbook.\n",
    "common_dtypes = {'Sitecode':'int', 'Address1':'str', 'Address2':'str', 'Suburb':'str',\n",
    "                 'Postcode':'int', 'FuelDescription':'str', 'Date':'str', 'AvgPrice':'float'}\n",
    "\n",
    "# Read every yearly workbook first and concatenate once: growing df0 with pd.concat\n",
    "# inside the loop re-copies all earlier rows on each iteration, and re-running the\n",
    "# cell would append the same files to df0 a second time.\n",
    "yearly_frames = [\n",
    "    pd.read_excel(\"./Input/\"+i,\n",
    "                  dtype={**common_dtypes, 'SiteName At Extraction Date':'str'})\n",
    "    for i in l_files\n",
    "]\n",
    "\n",
    "parts = []\n",
    "if yearly_frames:  # pd.concat raises ValueError on an empty list\n",
    "    parts.append(\n",
    "        pd.concat(yearly_frames, axis=0)\n",
    "        .rename(columns={'Brand At Transaction Date':'Brand', 'SiteName At Extraction Date':'SiteName'})\n",
    "    )\n",
    "\n",
    "# The older workbook is read with the short 'Brand'/'SiteName' column names;\n",
    "# its rows are flagged with old=1 so they can be told apart after merging.\n",
    "old_frame = pd.read_excel(\"./Input/AvgUlpPricesVicSites.xlsx\",\n",
    "                          dtype={**common_dtypes, 'Brand':'str', 'SiteName':'str'})\n",
    "old_frame['old'] = 1\n",
    "parts.append(old_frame)\n",
    "\n",
    "df0 = pd.concat(parts, axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3464a981-b2db-4e55-82c3-fc97f0cb48e0",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-07-24T06:48:40.869723Z",
     "iopub.status.busy": "2023-07-24T06:48:40.869596Z",
     "iopub.status.idle": "2023-07-24T06:48:41.718756Z",
     "shell.execute_reply": "2023-07-24T06:48:41.718400Z"
    }
   },
   "outputs": [],
   "source": [
    "# Tidy the merged frame: discard the stray spreadsheet column (if present),\n",
    "# parse the date strings, and switch every column name to lower case.\n",
    "df0 = (\n",
    "    df0.drop(columns='Unnamed: 11', errors='ignore')\n",
    "       .assign(Date=lambda frame: pd.to_datetime(frame['Date']))\n",
    "       .rename(columns=str.lower)\n",
    ")\n",
    "df0.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13c4fb4b-00aa-48ad-9e22-90dedb05ff9e",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-07-24T06:48:41.721226Z",
     "iopub.status.busy": "2023-07-24T06:48:41.721092Z",
     "iopub.status.idle": "2023-07-24T06:48:45.298253Z",
     "shell.execute_reply": "2023-07-24T06:48:45.297863Z"
    }
   },
   "outputs": [],
   "source": [
    "# Work on a sorted copy so df0 itself is left untouched (sort_values returns a new frame).\n",
    "df_vic = df0.sort_values(['sitecode', 'date'], ignore_index=True)\n",
    "print(df_vic['brand'].unique().tolist())\n",
    "\n",
    "# Collapse the raw brand labels into five major brand groups; every other label\n",
    "# (including a missing brand) becomes 'Other'.\n",
    "# Mapping in one step keeps 'bid' a string column from the start, instead of writing\n",
    "# strings into a float NaN column, which newer pandas versions warn about.\n",
    "brand_to_bid = {\n",
    "    'Caltex Woolworths': 'Woolworths',\n",
    "    'Coles Express': 'Coles',\n",
    "    '7 Eleven': '7-Eleven',\n",
    "    'BP': 'BP',\n",
    "    'Caltex': 'Caltex',\n",
    "}\n",
    "df_vic['bid'] = df_vic['brand'].map(brand_to_bid).fillna('Other')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3722df87-8fa4-4da5-9d7f-5f252cb5c1e6",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-07-24T06:48:45.300560Z",
     "iopub.status.busy": "2023-07-24T06:48:45.300436Z",
     "iopub.status.idle": "2023-07-24T06:48:52.909258Z",
     "shell.execute_reply": "2023-07-24T06:48:52.908801Z"
    }
   },
   "outputs": [],
   "source": [
    "# Drop duplicated (sitecode, date) observations.\n",
    "# Number of price observations per day before de-duplication.\n",
    "fig, ax = plt.subplots(figsize=[6,2])\n",
    "df_vic.groupby('date')['avgprice'].count().plot(ax=ax)\n",
    "plt.show()\n",
    "\n",
    "# Step 1 - overlapping samples: on 2015-08-01 and 2017-08-01 some stations carry more\n",
    "# than one non-missing price. NOTE: every such row is removed (no copy is kept).\n",
    "n_prices = df_vic.groupby(['sitecode', 'date'])['avgprice'].transform('count')\n",
    "overlap_dup = (((df_vic['date']=='2015-08-01')|(df_vic['date']=='2017-08-01'))\n",
    "               &(n_prices>1))\n",
    "# Report distinct stations (not rows), so the number matches the wording of the message.\n",
    "print('on 2015-08-01 or 2017-08-01, {} stations have duplicated prices'.format(\n",
    "    df_vic.loc[overlap_dup, 'sitecode'].nunique()))\n",
    "df_vic = df_vic[~overlap_dup].sort_values(['sitecode', 'date'], ignore_index=True)\n",
    "\n",
    "# Step 2 - remaining duplicates: keep one row per (sitecode, date), namely the one with\n",
    "# the lowest avgprice (ties resolved by row order).\n",
    "# NOTE: rows with a missing avgprice get a NaN rank and are therefore dropped as well.\n",
    "price_rank = df_vic.groupby(['sitecode', 'date'])['avgprice'].rank(method='first')\n",
    "print('sometime, when a station changes its ownership, it shows up twice in a day')\n",
    "df_vic = df_vic[price_rank==1].sort_values(['sitecode', 'date'], ignore_index=True)\n",
    "\n",
    "# Remove the source flag and any leftover helper columns.\n",
    "df_vic = df_vic.drop(columns=['old', '_np', '_drop'], errors='ignore')\n",
    "\n",
    "# Number of price observations per day after de-duplication.\n",
    "fig, ax = plt.subplots(figsize=[6,2])\n",
    "df_vic.groupby('date')['avgprice'].count().plot(ax=ax)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ccee1b0e-a76e-46d3-b367-90bcdf59754d",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-07-24T06:48:52.912163Z",
     "iopub.status.busy": "2023-07-24T06:48:52.912021Z",
     "iopub.status.idle": "2023-07-24T06:49:29.222758Z",
     "shell.execute_reply": "2023-07-24T06:49:29.222015Z"
    }
   },
   "outputs": [],
   "source": [
    "# to_csv does not create missing folders, so make sure ./Output exists first.\n",
    "os.makedirs(\"./Output\", exist_ok=True)\n",
    "df_vic.to_csv(\"./Output/is_vic_2005_2019_raw_merged.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0fe9b1e4-4871-426c-b177-800ba3f310a0",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
